* Folder globals used to build every file path in this do file. All five are
* empty here and need to be set to existing directories before running.
global data ""  // path for source data (raw censuses, county ID matches, county controls)
global temp ""  // path for intermediary data (files written and re-read by this do file)
global final ""  // path for final replication sample (the famine intensity file is also read from here)
global tables ""  // path for exporting tables (not referenced in the code visible here)
global figures ""  // path for exporting figures (not referenced in the code visible here)


/*------------------------------------------------------------------------------
This do file processes the raw census data (1990 and 2000 Population Censuses, 
2004 Economic Census and 2005 mini-Census) to generate the cleaned data for 
empirical analysis.

Input:
	Census90.dta: 1990 Population Census;
	Census00.dta: 2000 Population Census;
	Census04.dta: 2004 Economic Census;
	Census05.dta: 2005 Population mini-Census.
	match_00_04.dta: Manually compiled match of the county IDs in 2000 and 2004;
	match_90_00.dta: Manually compiled match of the county IDs in 1990 and 2000;
	county_controls.dta: County-level characteristics of 3,219 counties, 
		including county government expenditure, GDP and population in 2000, 
		and county agricultural productivity in 1993.
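	famine-intensity-estc.dta: Famine intensity for 2,727 counties (read from 
		the folder for the final replication sample).
	match_station.dta: Read from the folder for intermediary data when building 
		Census00_sample.dta; it is not generated by this do file.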

Output:
	Census05_sample.dta: To replicate Tables 1, S4-S5;
	Op-cond-cor.dta: To replicate Table S6 and Figure 2;
	Census00_sample.dta: To replicate Table S12 column (c).
------------------------------------------------------------------------------*/
/*------------------------------------------------------------------------------
Generate control variables (migration, shares of primary and secondary 
industries, share of large enterprises) for Table S4 columns (f)-(j)
------------------------------------------------------------------------------*/
* migration
* Builds the county-level in-migration rate from the 1990 Census, attaches the
* 2000 county IDs, and saves the result as migration.dta in the temp folder.
use "$data/Census90.dta", clear
* Only the record ID, birth year and residence code feed the saved output.
keep id r041 r061
format id %20.0g
* County code: the record ID with its last 12 digits stripped.
gen uid1990 = int(id/(10^12))
drop id
rename r041 birthyr
rename r061 reside

* 1000 is added to the recorded birth year (presumably stored as three digits;
* verify against the codebook). Records dated before 1900 are discarded.
replace birthyr = birthyr + 1000
drop if birthyr < 1900

* Every record whose residence code is not 1 counts as an in-migrant. A missing
* residence code also counts, because missing is not equal to 1.
gen migrate_in = (reside != 1)
gen popn = 1
collapse (sum) migrate_in popn, by(uid1990)
gen migrate_in_r = migrate_in/popn

* 1:m lets one 1990 county be matched to several 2000 counties.
merge 1:m uid1990 using "$data/match_90_00.dta"
keep if _merge == 3
drop _merge
drop uid1990 change popn migrate_in
save "$temp/migration.dta" , replace

* shares of primary and secondary industries
* County-level shares computed from the 1990 Census among records aged 16 or
* older whose two residence codes both equal 1; saved as industry.dta.
use "$data/Census90.dta", clear
keep id r03 r041 r061 r071 r092 r11
format id %20.0g
generate uid1990 = int(id/(10^12))
drop id
rename r03 sex
rename r041 birthyr
rename r061 reside
rename r071 reside85
rename r092 edu2
rename r11 occupation

* Sample restrictions.
keep if reside == 1 & reside85 == 1
replace birthyr = birthyr + 1000
drop if birthyr < 1900
generate age = 1990 - birthyr
drop if edu2 == 1 | age < 16
generate male = (sex == 1)

* Occupation codes 611-650 are flagged as industry1. Codes 711-949 are flagged
* as industry2, except the sub-ranges 861-884 and 901-909.
generate byte industry1 = inrange(occupation, 611, 650)
generate byte industry2 = inrange(occupation, 711, 949) ///
	& !inrange(occupation, 861, 884) ///
	& !inrange(occupation, 901, 909)

* County counts and shares.
generate popn = 1
collapse (sum) ind1_cnt = industry1 ind2_cnt = industry2 popn, by(uid1990)
generate ind1_r = ind1_cnt/popn
generate ind2_r = ind2_cnt/popn
keep uid1990 ind1_r ind2_r

* Attach 2000 county IDs and keep matched counties only.
merge 1:m uid1990 using "$data/match_90_00.dta"
keep if _merge == 3
keep uid2000 ind1_r ind2_r

save "$temp/industry.dta" , replace

* share of large enterprises in total employment
* NOTE(review): the variable built below is the county mean of a 0/1 flag over
* the records of the 2004 Economic Census, so it is a share of records rather 
* than a share of employment - confirm that the heading above is intended.
use "$data/Census04.dta" , clear

* County code: xzq with its last six digits stripped.
gen uid2004 = int(xzq/1000000)
gen private = (djzclx>=171) & (djzclx<=174)
gen byte private_large = 0
* Missing values compare greater than any number in Stata, so both kysjn and
* cyrs are required to be non-missing before the thresholds are applied.
replace private_large = 1 if private == 1 & kysjn >=1995 & !mi(kysjn) & cyrs > 100 & !mi(cyrs)
collapse (mean) priv_large = private_large , by(uid2004)
label var priv_large "Share of large enterprises (>100 workers)"

* Attach 2000 county IDs; drop counties without a usable 2000 ID.
merge 1:m uid2004 using "$data/match_00_04"
drop if _merge == 2
drop _merge
drop if uid2000== .
drop uid2004

save "$temp/privlarge.dta" , replace

/*------------------------------------------------------------------------------
Clean the 2005 Census data, and merge with famine intensity and other control 
variables
------------------------------------------------------------------------------*/
use "$data/Census05.dta", clear
rename r3 sex
rename r4_01 birthyr
rename r5 race
rename r11 huko
rename r24 income
rename r18 schst
rename r22 ownership
rename r23 clswk
* County code: xzq with its last six digits stripped.
gen uid2004 = int(xzq/1000000)
keep uid2004 sex birthyr race r6 huko income schst ownership clswk

* Attach 2000 county IDs.
* NOTE(review): m:m pairs records by position within each uid2004 group. If
* match_00_04 holds more than one row for a uid2004, individuals receive an
* arbitrary uid2000. Confirm the match file is unique on uid2004 (then m:1 is
* the safer merge type) or decide how split counties should be handled.
merge m:m uid2004 using "$data/match_00_04"
drop if _merge == 2
drop _merge

* Individual characteristics.
gen byte male = (sex== 1)
gen age = 2005-birthyr
gen byte han = race== 1
gen byte local = 1 if r6 == 1 | r6 == 2
gen byte rural = (huko == 1)

* City indicator from the last two digits of the county code: below 21, or 81
* and above. Missing compares greater than any number in Stata, so the guard
* keeps city missing when the county code is missing.
gen areaid = mod(uid2004,100)
gen byte city = 0 if !mi(areaid)
replace city = 1 if (areaid < 21 | areaid >= 81) & !mi(areaid)
drop areaid

* Sample restrictions, then the two entrepreneurship outcomes: private_boss1 
* (clswk 2 or 3 in ownership 5 or 6) and private_boss2 (clswk 2 only).
keep if local == 1
drop if age < 16 | schst == 1
gen byte private = 0
replace private = 1 if ownership==5 | ownership == 6
gen byte private_boss1 = 0
replace private_boss1 = 1 if (clswk == 2 | clswk == 3)& private ==1
gen byte private_boss2 = 0
replace private_boss2 = 1 if private == 1  & clswk == 2

drop if birthyr < 1900

* County-level controls; unmatched using-only counties are dropped each time.
merge m:1 uid2000 using "$data/county_controls"
drop if _merge == 2
drop _merge
gen gdpc = gdp/popn

merge m:1 uid2000 using "$temp/migration.dta"
drop if _merge == 2
drop _merge

merge m:1 uid2000 using "$temp/privlarge.dta"
drop if _merge == 2
drop _merge

merge m:1 uid2000 using "$temp/industry.dta"
drop if _merge == 2
drop _merge

* Famine intensity: only individuals in counties with a famine record remain.
merge m:1 uid2000 using "$final/famine-intensity-estc.dta"
keep if _merge == 3
drop _merge

* Province code: the 2000 county code with its last four digits stripped.
gen prov = int(uid2000/10000)

* Six-way grouping of province codes, written as the final assignment of each
* code. Codes not listed stay missing.
gen region6 = .
replace region6 = 1 if inlist(prov, 11, 12, 13, 15, 32, 33)
replace region6 = 2 if inlist(prov, 21, 22, 23)
replace region6 = 3 if inlist(prov, 34, 35, 36, 37, 44)
replace region6 = 4 if inlist(prov, 31, 41, 42, 43, 45, 46)
replace region6 = 5 if inlist(prov, 50, 51, 52, 53, 54)
replace region6 = 6 if inlist(prov, 14, 61, 62, 63, 64, 65)

* Birth-cohort dummies: group1 covers 1900-1940, group2 to group11 cover
* consecutive three-year windows from 1941-43 to 1968-70.
gen byte group1 = inrange(birthyr, 1900, 1940)
label var group1 "Born(1900-40)"
forvalues g = 2/11 {
	local first = 1935 + 3*`g'
	local last = `first' + 2
	local last2 = `last' - 1900
	gen byte group`g' = inrange(birthyr, `first', `last')
	label var group`g' "Born(`first'-`last2')"
}

gen income_ln = ln(income)

* Famine intensity split by quartile for individuals born in 1961 or earlier.
* The split is done three times: on everyone (n), on women (f) and on men (m).
* Each pass takes the quartile cut-offs of the famine measure within its own 
* subsample, rounds them to three decimals, and stores the measure in the 
* variable of the quartile the county falls in (zero in the other three).
* NOTE(review): the rounding of the cut-offs can move counties sitting right
* at a quartile boundary - confirm this is intended.
foreach s in n f m {
	local who "birthyr <=1961"
	if "`s'" == "f" local who "`who' & male==0"
	if "`s'" == "m" local who "`who' & male==1"

	su rltv_death_rate_n_1990 if `who', d
	global death_25 : display %5.3f r(p25)
	global death_50 : display %5.3f r(p50)
	global death_75 : display %5.3f r(p75)

	gen q1 = (rltv_death_rate_n_1990 <= $death_25) if `who'
	gen q2 = (rltv_death_rate_n_1990 > $death_25) & (rltv_death_rate_n_1990 <= $death_50) if `who'
	gen q3 = (rltv_death_rate_n_1990 > $death_50) & (rltv_death_rate_n_1990 <= $death_75) if `who'
	gen q4 = (rltv_death_rate_n_1990 > $death_75) if `who'

	forvalues q = 1/4 {
		gen famine_`s'_q`q' = rltv_death_rate_n_1990*q`q' if `who'
	}
	drop q1 q2 q3 q4
}

* Label the two outcomes, remove the raw inputs that are no longer needed, and
* write the individual-level replication sample.
label variable private_boss1 "Owner or self-employed"
label variable private_boss2 "Owner"
drop uid2004 r6 huko local schst ownership clswk private income
compress
save "$final/Census05_sample.dta", replace

/*------------------------------------------------------------------------------
Projection of cohort size and total population for 1947-1967 using 1990 Census
------------------------------------------------------------------------------*/

use "$data/Census90.dta", clear
keep id r03 r041 r061 r071
format id %20.0g
* County code: the record ID with its last 12 digits stripped.
gen uid1990=int(id/(10^12))
drop id
rename r03 sex
rename r041 birthyr
rename r061 reside
rename r071 reside85
* Keep records whose two residence codes both equal 1.
keep if reside85 == 1 & reside == 1
drop reside85 reside

* Birth years 1947-1970 only; missing birth years fall out with the upper cut.
replace birthyr = birthyr + 1000
drop if birthyr < 1947
drop if birthyr > 1970
rename birthyr year
label var year "Year"

* Head counts by county and birth year: all, male, and female as the remainder.
gen byte count = 1
gen byte count_m = 1 if sex == 1
collapse (sum) n_1990 = count m_1990 = count_m , by(uid1990 year)
gen f_1990 = n_1990 - m_1990
label var n_1990 "Cohort size (1990)"
label var m_1990 "Cohort size (male)"
label var f_1990 "Cohort size (female)"

* Keep counties observed in all 24 birth years.
bysort uid1990: gen count = _N
drop if count != 24
drop count

* County-by-county linear trend in cohort size, fitted on birth years up to
* 1957 and after 1962, then predicted for every year of that county. The list
* of counties does not change inside the loop, so it is built once.
quietly levelsof uid1990, local(counties)
foreach i in n_1990 m_1990 f_1990 {
	gen pred_`i' = .
	foreach x of local counties {
		quietly reg `i' year if uid1990 == `x' & (year <= 1957 | year > 1962)
		predict temp_pred_`i', xb
		replace pred_`i' = temp_pred_`i' if uid1990 == `x'
		drop temp_pred*
	}
}

* Aggregate the yearly projections into three-year cohorts 1947-49 to 1965-67,
* each identified by its first year.
drop if year > 1967
gen int cohort = .

forvalues i = 0/6 {
	local j = 1947 + 3*`i'
	local k = 1947 + 3*`i'+3
	replace cohort = `j' if year >= `j' & year < `k'
}

drop if cohort == .
collapse (sum) pred_n_1990 pred_m_1990 pred_f_1990 , by(uid1990 cohort)
label var pred_n_1990 "Projected cohort size (all) [1947-1957, 1963-1970]"
label var pred_m_1990 "Projected cohort size (male) [1947-1957, 1963-1970]"
label var pred_f_1990 "Projected cohort size (female) [1947-1957, 1963-1970]"

* County totals over the cohorts starting before 1962; missing for later ones.
foreach i in n_1990 m_1990 f_1990 {
	bysort uid1990: egen population_ttl_`i' = total(pred_`i') if cohort < 1962
}
label var population_ttl_n_1990 "Projected population size for cohort 1947-61 (all) [1947-1957, 1963-1970]"
label var population_ttl_m_1990 "Projected population size for cohort 1947-61 (male) [1947-1957, 1963-1970]"
label var population_ttl_f_1990 "Projected population size for cohort 1947-61 (female) [1947-1957, 1963-1970]"

* Attach 2000 county IDs. Both sides can hold several rows per 1990 county
* (cohorts here, 2000 counties in the match file), so every cohort row is
* paired with every matching 2000 county. An m:m merge would instead pair rows
* by position within the county and spread cohorts across 2000 counties
* arbitrarily. With its default option joinby keeps matched rows only.
* NOTE(review): the later 1:1 merge on uid2000 and cohort needs each uid2000 to
* come from a single 1990 county - confirm against the match file.
joinby uid1990 using "$data/match_90_00.dta"
drop uid1990 change

save "$temp/population.dta", replace

/*------------------------------------------------------------------------------
For Table S6. Calculate size of entrepreneurs at county-cohort level, and merge 
with the respective cohort size. 
------------------------------------------------------------------------------*/
use "$final/Census05_sample.dta" , clear

* Fold the cohort dummies into one index; 12 is the residual category for
* records flagged by none of group1 to group11.
gen byte group = 12
forvalues g = 1/11 {
	replace group = `g' if group`g' == 1
}
* Recode the index to the first birth year of each cohort. Each continuation
* marker is preceded by a blank, as Stata requires.
recode group ///
	(12 = 1971 "1971 onwards") (11 = 1968 "1968-70") (10 = 1965 "1965-67") ///
	(9 = 1962 "1962-64") (8 = 1959 "1959-61") (7 = 1956 "1956-58") ///
	(6 = 1953 "1953-55") (5 = 1950 "1950-52") (4 = 1947 "1947-49") ///
	(3 = 1944 "1944-46") (2 = 1941 "1941-43") (1 = 1938 "Before 1940") ///
	, gen(cohort)
label var cohort "Birth cohort"

* Sex-specific copies of the outcome (missing for the other sex).
gen private_boss1_m = private_boss1 if sex == 1
gen private_boss1_f = private_boss1 if sex == 2

drop if cohort < 1947

* County totals of owners or self-employed.
* NOTE(review): these totals run over every cohort still in the data at this
* point, including those born after 1961, while the labels further down say
* 1947-61 - confirm which is intended.
egen private_boss1_ttl = total(private_boss1), by(uid2000)
egen private_boss1_ttl_m = total(private_boss1_m), by(uid2000)
egen private_boss1_ttl_f = total(private_boss1_f), by(uid2000)

gen female = 1 - male

* One row per county and cohort. The county-level variables are carried 
* through by listing them in by().
collapse (sum) private_boss1 private_boss1_m private_boss1_f ///
	male female, by(uid2000 power_2 rltv_death_rate_*_1990 ///
	private_boss1_t* group cohort region6)

rename private_boss1 private_boss1_cor
rename private_boss1_m private_boss1_m_cor
rename private_boss1_f private_boss1_f_cor
rename male population_cor_m
rename female population_cor_f

* Attach the projected cohort sizes; county-cohorts without a projection stay.
drop if mi(uid2000)
merge 1:1 uid2000 cohort using "$temp/population.dta"
drop if _merge == 2
drop _merge
compress

* Logs of the county totals.
gen private_n_ttl_ln = ln(private_boss1_ttl)
gen private_m_ttl_ln = ln(private_boss1_ttl_m)
gen private_f_ttl_ln = ln(private_boss1_ttl_f)
foreach s in n m f {
	gen popn_`s'_ttl_ln = ln(population_ttl_`s'_1990)
}

label var private_n_ttl_ln "Total number of owner or self-employed (ln) (1947-61)"
label var private_m_ttl_ln "Total number of male owner or self-employed (ln) (1947-61)"
label var private_f_ttl_ln "Total number of female owner or self-employed (ln) (1947-61)"
label var popn_n_ttl_ln "Projected total population (ln) (1947-61)"
label var popn_m_ttl_ln "Projected total male population (ln) (1947-61)"
label var popn_f_ttl_ln "Projected total female population (ln) (1947-61)"

* Logs of the county-cohort counts and projections.
gen private_n_cor_ln = ln(private_boss1_cor)
foreach s in m f {
	gen private_`s'_cor_ln = ln(private_boss1_`s'_cor)
}
foreach s in n m f {
	gen pred_`s'_cor_ln = ln(pred_`s'_1990)
}

* County flag equal to 1 when any cohort of the county lacks either log.
foreach s in n m f {
	gen missing_`s' = (private_`s'_cor_ln == . | pred_`s'_cor_ln == .)
	bysort uid2000: egen mis_`s' = max(missing_`s')
}
drop missing_*

label var private_n_cor_ln "Cohort size of owner or self-employed (ln)"
label var private_m_cor_ln "Cohort size of male owner or self-employed (ln)"
label var private_f_cor_ln "Cohort size of female owner or self-employed (ln)"
label var pred_n_cor_ln "Projected cohort size (all) (ln)"
label var pred_m_cor_ln "Projected cohort size (male) (ln)"
label var pred_f_cor_ln "Projected cohort size (female) (ln)"

drop private_boss1_* population_* pred_*_1990 group

save "$final/Op-cond-cor.dta" , replace

/*------------------------------------------------------------------------------
For Table S12. Calculate size of police population in 2000 Census at county 
level.
------------------------------------------------------------------------------*/
* clear is the documented option for replacing the data in memory with use.
use "$data/Census00.dta", clear
* County code: the record ID with its last 12 digits stripped.
gen uid2000 = int(id/10^12)
* Records with r20 equal to 321 or 322 are counted as police.
gen police = 1 if r20 == 321 | r20 == 322
gen flag = 1
collapse (sum) police popn=flag , by(uid2000)

* Famine intensity; counties without census records are dropped.
merge m:1 uid2000 using "$final/famine-intensity-estc.dta"
drop if _merge == 2
drop _merge

* NOTE(review): match_station.dta is expected in the temp folder but is not
* created anywhere in the code visible here - confirm where it comes from.
merge 1:1 uid2000 using "$temp/match_station.dta"
keep if _merge == 3
drop _merge

* Logs; a county with zero police gets a missing police_ln.
gen police_ln = ln(police)
gen popn_ln = ln(popn)
label var popn_ln "Population (ln)"
label var police_ln "Police (ln)"

keep police_ln rltv_death_rate_n_1990 popn_ln stationid police

save "$final/Census00_sample.dta",replace
